# Module 1 Exercise — Introduction to R and Data Science
# Based on R4DS Chapters 1, 4, 6, and 8
# Practice R fundamentals, vectors, workflow, and project organization

# ===========================
# SETUP: Load Tidyverse and Clinical Packages
# ===========================

# Load the core packages (install first if needed: install.packages(c("tidyverse", "haven", "lubridate")))
library(tidyverse)  # Loads dplyr, tibble, readr, ggplot2, and more
library(haven)      # SAS file I/O
library(lubridate)  # Date handling

# Report where this session is running and which R build is in use
cat("Working Directory:", getwd(), "\n", sep = " ")
cat("R Version:", R.version$version.string, "\n", sep = " ")

# ===========================
# EXERCISE 1: Working with Vectors (R4DS Ch. 4)
# ===========================

# Create atomic vectors for clinical data.
# Every vector holds one element per subject (5 subjects), in the same order,
# so position i in each vector describes the same subject.
#
# NOTE: an assignment such as `x <- # comment` has no right-hand side, so R
# keeps reading and silently assigns the NEXT statement's value to x. Each
# exercise prompt below is therefore followed by a complete assignment.

# Logical vectors for flags
safety_population <- c(TRUE, TRUE, FALSE, TRUE, TRUE)
# Exercise: create a logical vector flagging elderly subjects.
# The values are written out by hand and agree with `age >= 65` for the ages
# defined below (subjects 3 and 5 are aged 67 and 71).
elderly_flag <- c(FALSE, FALSE, TRUE, FALSE, TRUE)

# Numeric vectors for measurements
age <- c(25, 45, 67, 52, 71)
# Exercise: create weight vector (65.2, 78.5, 85.1, 72.8, 90.3)
weight <- c(65.2, 78.5, 85.1, 72.8, 90.3)
# Exercise: create height vector (165, 170, 160, 175, 168)
height <- c(165, 170, 160, 175, 168)

# Character vectors for IDs and categories
usubjid <- c("001-001", "001-002", "001-003", "001-004", "001-005")
# Exercise: create treatment vector
treatment <- c("Placebo", "Drug A", "Drug A", "Placebo", "Drug A")

# Vector operations (vectorized: computed element-wise for all subjects)
# Exercise: calculate BMI using weight / ((height/100)^2);
# height is divided by 100 before squaring, as the prompt's formula specifies.
bmi <- weight / ((height / 100)^2)
# Exercise: use ifelse() to create "Elderly" (>=65) vs "Adult" (<65)
age_group <- ifelse(age >= 65, "Elderly", "Adult")

# Display your vectors
cat("Ages:", age, "\n")
cat("BMI:", round(bmi, 1), "\n")
cat("Age Groups:", age_group, "\n")

# ===========================
# EXERCISE 2: Create Clinical Dataset with Tibbles
# ===========================

# Assemble the subject-level demographics tibble from the Exercise 1 vectors.
# Each argument becomes one column, in the order written; SEX and RFSTDTC are
# supplied inline, and BMI is stored rounded to one decimal place.
dm <- tibble::tibble(
  USUBJID = usubjid,
  AGE     = age,
  WEIGHT  = weight,
  HEIGHT  = height,
  SEX     = c("F", "M", "F", "M", "F"),
  TRT01A  = treatment,
  RFSTDTC = c(
    "2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19"
  ),
  SAFFL   = safety_population,
  BMI     = round(bmi, digits = 1),
  AGEGR1  = age_group
)

# Show the dataset (a bare object name auto-prints at top level)
dm

# Ways to explore the dataset:
# 1. View() opens the data viewer (try this in RStudio!)
# View(dm)

# 2. glimpse() lists every column with its type and first values
dplyr::glimpse(dm)

# 3. summary() gives per-column summary statistics
summary(object = dm)

# ===========================
# EXERCISE 3: RStudio Workflow Practice (R4DS Ch. 6)
# ===========================

# Practice RStudio features:
# 1. Type "dm$" and use Tab to see auto-completion options
# YOUR CODE HERE: dm$

# 2. Get help for a function
# YOUR CODE HERE: Use ?mutate to see help documentation

# 3. Use keyboard shortcuts
# Try Ctrl+Enter (Cmd+Enter) to run individual lines
# Try Ctrl+Shift+Enter (Cmd+Shift+Enter) to run entire script

# Practice code organization - add comments explaining clinical context.
# Three derived columns are appended to dm in a single mutate() call.
dm <- mutate(
  dm,

  # Elderly flag per protocol definition: "Y" when age >= 65, otherwise "N"
  ELDERLY = ifelse(AGE >= 65, "Y", "N"),

  # Parse the character date (year-month-day) into a Date for calculations
  RFSTDT = ymd(RFSTDTC),

  # BMI category with cut points at 18.5, 25 and 30. case_when() takes the
  # first matching branch, so testing from the highest bin downwards gives
  # each value the same label as testing upwards with "<" comparisons.
  BMICAT = case_when(
    BMI >= 30 ~ "Obese",
    BMI >= 25 ~ "Overweight",
    BMI >= 18.5 ~ "Normal",
    BMI < 18.5 ~ "Underweight"
  )
)

# Show the dataset with the new columns
dm

# ===========================
# EXERCISE 4: Project Organization (R4DS Ch. 8)
# ===========================

# File paths and project structure: first show where R is currently working
cat("Current working directory:", getwd(), "\n", sep = " ")

# Then print the layout a real clinical programming project would use.
# All pieces are emitted by one cat() call; sep = "" joins them exactly as
# written, so each piece carries its own trailing newline.
cat(
  "\nRecommended project structure:\n",
  "my_clinical_study/\n",
  "├── my_clinical_study.Rproj\n",
  "├── data/\n",
  "│   ├── raw/           # Original data files\n",
  "│   ├── sdtm/          # SDTM datasets\n",
  "│   └── adam/          # ADAM datasets\n",
  "├── programs/\n",
  "│   ├── sdtm/          # SDTM creation programs\n",
  "│   ├── adam/          # ADAM creation programs\n",
  "│   └── tlf/           # Tables, listings, figures programs\n",
  "└── outputs/           # Generated outputs\n\n",
  sep = ""
)

# Script organization best practices
# YOUR CODE HERE: Add a comment explaining what the next code block does

# Build the safety-population listing one step at a time:
# keep rows whose SAFFL flag is TRUE (SAFFL is a logical column) ...
safety_summary <- filter(dm, SAFFL)
# ... retain only the key variables ...
safety_summary <- select(safety_summary, USUBJID, AGE, SEX, TRT01A, BMICAT)
# ... and order subjects from youngest to oldest.
safety_summary <- arrange(safety_summary, AGE)

# Show the resulting listing
safety_summary

# ===========================
# EXERCISE 5: Data Summarization Practice
# ===========================

# Use group_by() and summarise() to answer these questions.
#
# NOTE: a pipeline must not end in `%>%` followed only by a comment. R would
# keep reading and pipe into the next statement, so none of the summaries
# would be created. Each pipeline below is complete.

# 1. How many subjects are in each BMI category?
# One row per BMICAT with the subject count in N; .groups = "drop" returns an
# ungrouped tibble.
bmi_summary <- dm %>%
  group_by(BMICAT) %>%
  summarise(N = n(), .groups = "drop")

# 2. What is the mean age by treatment group?
# One row per TRT01A with the mean of AGE in MEAN_AGE.
age_by_treatment <- dm %>%
  group_by(TRT01A) %>%
  summarise(MEAN_AGE = mean(AGE), .groups = "drop")

# 3. How many elderly vs non-elderly subjects are there?
# count() is shorthand for group_by() + summarise(n = n()).
elderly_summary <- dm %>%
  count(ELDERLY)

# Display your summaries
cat("BMI Category Summary:\n")
print(bmi_summary)

cat("\nAge by Treatment:\n")
print(age_by_treatment)

cat("\nElderly Summary:\n")
print(elderly_summary)

# ===========================
# EXERCISE 6: Getting Help and AI Assistance
# ===========================

# Practice using R's help system:
# 1. Run ?mean in the console to see help for the mean function
# 2. Run ??vector to search for vector-related functions
# 3. Run example(tibble) to see examples of tibble usage

# Try GitHub Copilot practice (if available in your RStudio):
# Type these comments and see what Copilot suggests:

# Create a flag for subjects under 30 years old
# Exercise: use ifelse() to create flag for AGE < 30.
# "Y"/"N" coding mirrors the ELDERLY column already on dm.
# (An assignment left as `x <- # comment` would silently take the value of the
# next statement, so each prompt here is followed by a complete assignment.)
young_flag <- ifelse(dm$AGE < 30, "Y", "N")

# Calculate days since study start for each subject
# Exercise: use as.numeric() and date arithmetic.
# Subtracting two Dates yields a difftime in days; as.numeric() strips the
# units. NOTE(review): "study start" is read here as each subject's own
# reference start date (RFSTDT), measured up to today's date - confirm this is
# the intended reference point. The result changes with the run date.
study_days <- as.numeric(Sys.Date() - dm$RFSTDT)

# Create treatment duration in weeks (example calculation)
# Assume treatment duration is 12 weeks for all subjects
treatment_weeks <- 12
total_days <- treatment_weeks * 7

cat("Example calculations:\n")
cat("Treatment duration:", treatment_weeks, "weeks =", total_days, "days\n")

# ===========================
# EXERCISE 7: String Manipulation and Final Summary
# ===========================

# Derive site and label columns from the subject identifier with stringr
dm <- mutate(
  dm,

  # SITE: the first run of three consecutive digits found in USUBJID
  # (for IDs shaped like "001-002" this is the part before the dash)
  SITE = str_extract(string = USUBJID, pattern = "\\d{3}"),

  # SUBJ_LABEL: human-readable label combining the ID and the age
  SUBJ_LABEL = paste0("Subject ", USUBJID, " (Age: ", AGE, ")")
)

# Print the final dataset under a heading
cat("Final demographics dataset:\n")
print(dm)

# ===========================
# BONUS: Create a Simple Summary Report
# ===========================

# Print a plain-text demographics report: header, subject count, age range,
# counts by sex, and the number of subjects flagged ELDERLY == "Y".
cat("=== DEMOGRAPHICS SUMMARY ===\n")
cat("Total subjects:", nrow(dm), "\n", sep = " ")
cat("Age range:", min(dm[["AGE"]]), "to", max(dm[["AGE"]]), "\n", sep = " ")
cat("Female subjects:", sum(dm[["SEX"]] == "F"), "\n", sep = " ")
cat("Male subjects:", sum(dm[["SEX"]] == "M"), "\n", sep = " ")
cat("Elderly subjects (65+):", sum(dm[["ELDERLY"]] == "Y"), "\n", sep = " ")

# ===========================
# EXERCISE COMPLETE!
# ===========================

# Closing banner: recap of the topics practiced and a pointer to Module 2.
# A single cat() call with sep = "" emits the pieces exactly as written.
cat(
  "\n🎉 Congratulations! You've completed Module 1 exercises!\n",
  "\nYou practiced:\n",
  "✅ Data science process and clinical applications (R4DS Ch. 1)\n",
  "✅ Working with vectors - R's fundamental data structures (R4DS Ch. 4)\n",
  "✅ RStudio workflow and interface navigation (R4DS Ch. 6)\n",
  "✅ Script organization and project management (R4DS Ch. 8)\n",
  "✅ Creating tibbles and data manipulation with tidyverse\n",
  "✅ Getting help and using AI assistance effectively\n",
  "\n🚀 Ready for Module 2: Data Manipulation with dplyr!\n",
  "Next you'll learn the five key verbs: filter, select, mutate, arrange, summarise\n",
  sep = ""
)
